Importing the Libraries:
In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Reading the dataset
In [15]:
# Load the accident records from 'accident.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('accident.csv')
df.head(10) # Preview the first 10 rows of the dataset
Out[15]:
| Accident_ID | State | Date | Time | Reason | Number_of_Deaths | Number_of_Injuries | Road_Type | Weather_Conditions | Alcohol_Involved | Driver_Fatigue | Road_Conditions | Speed_Limit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1001 | Andhra Pradesh | 01-05-2021 | 15:30:00 | Speeding | 2 | 3 | Rural | Sunny | No | No | Poor | 60.0 |
| 1 | 1002 | Karnataka | 02-05-2021 | 18:45:00 | Drunk Driving | 1 | 4 | Urban | Rainy | Yes | No | Good | 40.0 |
| 2 | 1003 | Delhi | 03-05-2021 | 10:15:00 | Poor Road Conditions | 0 | 2 | Urban | Foggy | No | No | Poor | 50.0 |
| 3 | 1004 | Maharashtra | 04-05-2021 | 06:00:00 | Driver Fatigue | 3 | 6 | Rural | Sunny | No | Yes | Good | 70.0 |
| 4 | 1005 | Uttar Pradesh | 05-05-2021 | 21:20:00 | Speeding | 1 | 2 | Rural | Sunny | No | No | Fair | 80.0 |
| 5 | 1006 | Tamil Nadu | 06-05-2021 | 12:00:00 | Drunk Driving | 0 | 1 | Urban | Sunny | Yes | No | Good | 30.0 |
| 6 | 1007 | Rajasthan | 07-05-2021 | 14:30:00 | Poor Road Conditions | 2 | 5 | Rural | Rainy | No | No | Poor | 60.0 |
| 7 | 1008 | West Bengal | 08-05-2021 | 08:00:00 | Driver Fatigue | 1 | 3 | Urban | Sunny | No | Yes | Fair | 50.0 |
| 8 | 1009 | Gujarat | 09-05-2021 | 17:30:00 | Speeding | 0 | 4 | Rural | Sunny | No | No | Good | 60.0 |
| 9 | 1010 | Assam | 10-05-2021 | 11:45:00 | Poor Road Conditions | 1 | 2 | Urban | Foggy | No | No | Poor | 40.0 |
Checking the size of the dataset
In [16]:
# Dimensions of the dataset as (number of rows, number of columns).
df.shape
Out[16]:
(300, 13)
Finding the data about the columns and their datatypes
In [17]:
# Per-column summary: name, non-null count and dtype, plus memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Accident_ID 300 non-null int64 1 State 300 non-null object 2 Date 300 non-null object 3 Time 300 non-null object 4 Reason 300 non-null object 5 Number_of_Deaths 300 non-null int64 6 Number_of_Injuries 300 non-null int64 7 Road_Type 300 non-null object 8 Weather_Conditions 300 non-null object 9 Alcohol_Involved 300 non-null object 10 Driver_Fatigue 300 non-null object 11 Road_Conditions 300 non-null object 12 Speed_Limit 299 non-null float64 dtypes: float64(1), int64(3), object(9) memory usage: 30.6+ KB
Checking for Null values
In [18]:
# Number of missing (null) values in each column.
df.isnull().sum()
Out[18]:
Accident_ID 0 State 0 Date 0 Time 0 Reason 0 Number_of_Deaths 0 Number_of_Injuries 0 Road_Type 0 Weather_Conditions 0 Alcohol_Involved 0 Driver_Fatigue 0 Road_Conditions 0 Speed_Limit 1 dtype: int64
Dealing with missing values
In [19]:
# Speed_Limit contains 1 null value. Before choosing a fill value, inspect the
# distribution of the column (count, mean, std, quartiles, min/max).
df['Speed_Limit'].describe()
Out[19]:
count 299.000000 mean 56.956522 std 13.047466 min 30.000000 25% 50.000000 50% 60.000000 75% 60.000000 max 90.000000 Name: Speed_Limit, dtype: float64
In [20]:
# Impute the single missing Speed_Limit with the column median. The median is
# computed from the data (60.0 for this dataset, per describe()) instead of
# being hard-coded, so the cell stays correct if the CSV changes.
Speed_Limit_median = df['Speed_Limit'].median()
# fillna() returns a new Series and does not modify df, so the result must be
# assigned back to the column; otherwise the NaN silently remains in df.
df['Speed_Limit'] = df['Speed_Limit'].fillna(Speed_Limit_median)
# Display the filled column.
df['Speed_Limit']
Out[20]:
0 60.0
1 40.0
2 50.0
3 70.0
4 80.0
...
295 70.0
296 50.0
297 40.0
298 60.0
299 70.0
Name: Speed_Limit, Length: 300, dtype: float64
In [21]:
# Number of distinct values in each column.
df.nunique()
Out[21]:
Accident_ID 300 State 27 Date 210 Time 127 Reason 7 Number_of_Deaths 6 Number_of_Injuries 7 Road_Type 2 Weather_Conditions 5 Alcohol_Involved 2 Driver_Fatigue 2 Road_Conditions 3 Speed_Limit 7 dtype: int64
Reasons For Road Accidents
In [22]:
# Upper-case every 'Reason' label so that the same reason written with
# different casing is counted as one category.
df['Reason'] = df['Reason'].str.upper()

# Accidents per reason.
reason_counts = df['Reason'].value_counts()

# One 'Spectral' colour per pie slice.
colors = sns.color_palette('Spectral', len(reason_counts))

# Draw the pie chart.
plt.figure(figsize=(5, 5))
plt.pie(
    reason_counts,
    labels=reason_counts.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Reasons for Accidents', fontsize=18)
plt.axis('equal')  # equal axis scaling keeps the pie circular
plt.show()
Accident Analysis by State
In [23]:
# Accidents per state: count the Accident_ID entries within each state.
state_accidents = (
    df.groupby('State')['Accident_ID']
    .count()
    .reset_index()
)

# Rank the states from most to fewest accidents and keep the ten highest.
sorted_states = state_accidents.sort_values(by='Accident_ID', ascending=False)
top_states = sorted_states.head(10)

# One 'Spectral' colour per bar.
palette = sns.color_palette("Spectral", len(top_states))

# Horizontal bar chart; hue mirrors the y variable so each state gets its own
# palette colour, and the redundant legend is suppressed.
plt.figure(figsize=(8, 4))
ax = sns.barplot(
    x='Accident_ID',
    y='State',
    data=top_states,
    palette=palette,
    hue='State',
    legend=False,
)
ax.set_title('Number of Accidents by State (Top 10)', fontsize=18)
ax.set_xlabel('Number of Accidents')
ax.set_ylabel('State')
plt.show()
Impact of Weather Condition on Road Accidents
In [24]:
# Count accidents per weather condition.
weather_accidents = df.groupby('Weather_Conditions')['Accident_ID'].count().reset_index()
# Order the conditions from fewest to most accidents so the bars rise left to right.
sorted_weather = weather_accidents.sort_values(by='Accident_ID', ascending=True)
# Keep at most 10 conditions (the dataset has 5 distinct ones, so all are kept).
top_weather = sorted_weather.head(10)
# One 'viridis' colour per weather condition.
palette = sns.color_palette("viridis", len(top_weather))
# Set the style and context
sns.set(style="whitegrid", context="talk")
# Create the vertical bar graph.
# hue follows the x variable (as the state chart does with hue='State') so every
# condition gets its own colour; colouring by the count itself would merge
# conditions that happen to have the same number of accidents.
plt.figure(figsize=(8, 6))
barplot = sns.barplot(
    x='Weather_Conditions',
    y='Accident_ID',
    data=top_weather,
    palette=palette,
    hue='Weather_Conditions',
    legend=False,
)
# Add the accident count as a label on top of each bar; bars are drawn in the
# row order of top_weather, so the enumerate index is the bar's x position.
for index, value in enumerate(top_weather['Accident_ID']):
    plt.text(index, value, str(value), color='black', ha="center", va="bottom", fontsize=12)
# Add titles and labels
plt.title('Number of Accidents by Weather Condition', fontsize=18)
plt.xlabel('Weather Condition', fontsize=14)
plt.ylabel('Number of Accidents', fontsize=14)
plt.xticks(rotation=45)  # Rotate the x-axis labels for better readability
plt.show()
Impact of Speeding
In [25]:
# Average number of deaths per accident for each posted speed limit.
speed_stats = (
    df.groupby('Speed_Limit')['Number_of_Deaths']
    .mean()
    .reset_index()
)

# Plot styling.
sns.set(style="whitegrid", context="talk")

# Line plot of the averages, one marker per speed limit.
plt.figure(figsize=(7, 4))
ax = sns.lineplot(
    data=speed_stats,
    x='Speed_Limit',
    y='Number_of_Deaths',
    marker='o',
    label='Average Number of Deaths',
    color='green',
)

# Title, axis labels and legend.
ax.set_title('Impact of Speeding on Accident Severity', fontsize=18)
ax.set_xlabel('Speed Limit', fontsize=14)
ax.set_ylabel('Average Number of Deaths', fontsize=14)
ax.legend()
plt.show()
Alcohol-Involved Accidents (State-wise)
In [26]:
import plotly.express as px

# Keep only the accidents flagged as involving alcohol.
alcohol_accidents_df = df.loc[df['Alcohol_Involved'].eq('Yes')]

# Alcohol-related accidents per state, as a two-column frame
# ('State', 'Number_of_Accidents') ordered from most to fewest.
state_counts = (
    alcohol_accidents_df['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Number_of_Accidents')
)

# Bar chart of the counts; bar colour encodes the count on the Inferno scale.
fig = px.bar(
    state_counts,
    x='State',
    y='Number_of_Accidents',
    title='Alcohol-Related Accidents by State',
    labels={'Number_of_Accidents': 'Number of Accidents'},
    color='Number_of_Accidents',
    color_continuous_scale=px.colors.sequential.Inferno,
)

# Turn the state names vertical and shrink them so they do not overlap.
fig.update_layout(xaxis=dict(tickangle=-90, tickfont=dict(size=8)))

# Display the plot
fig.show()
In [27]:
# Create a new column to classify the accidents as rural or urban based on the road type
df['Location_Type'] = df['Road_Type'].apply(lambda x: 'Rural' if x.startswith('R') else 'Urban')
# Count the number of accidents by location type
location_counts = df['Location_Type'].value_counts().reset_index()
location_counts.columns = ['Location_Type', 'Count']
# Define the colors
colors = ['#FDFF00', ' #C21807']
# Create the pie chart
fig = px.pie(location_counts, values='Count', names='Location_Type',
title='Accidents by Location Type', color_discrete_sequence=colors,
hole=0.6, labels={'Count': 'Number of Accidents'})
# Update the traces to add a border
fig.update_traces(marker=dict(line=dict(color='#000000', width=1)))
# Display the plot
fig.show()
Visualizing the Number of Accidents in 2021, 2022, and 2023
In [28]:
import plotly.graph_objects as go

# Parse the date column as datetimes; the source strings are day-first
# (e.g. '01-05-2021' is 1 May 2021).
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
# Extract the year and month from the date
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
# Number of accidents for every (year, month) pair present in the data
monthly_accidents = df.groupby(['Year', 'Month']).size().reset_index(name='Number_of_Accidents')

# Years in descending order; trace k and dropdown button k+1 both refer to years[k].
years = sorted(monthly_accidents['Year'].unique(), reverse=True)

# One line trace per year. Every trace starts visible so the initial view
# agrees with the initially selected 'All Years' dropdown entry (active=0) and
# with the initial title 'Monthly Accidents Over Years'. Previously only the
# first trace was shown while the dropdown and title claimed all years.
fig = go.Figure()
for year in years:
    yearly_data = monthly_accidents[monthly_accidents['Year'] == year]
    fig.add_trace(go.Scatter(
        x=yearly_data['Month'],
        y=yearly_data['Number_of_Accidents'],
        mode='lines+markers',
        name=str(year),
        visible=True
    ))

# Dropdown: first entry shows every year at once ...
dropdown_buttons = [
    dict(label='All Years',
         method='update',
         args=[{'visible': [True] * len(years)},
               {'title': 'Monthly Accidents Over Years'}])
]
# ... followed by one entry per year that shows only that year's trace.
dropdown_buttons.extend(
    dict(label=str(year),
         method='update',
         args=[{'visible': [j == k for k in range(len(years))]},
               {'title': f'Monthly Accidents in {year}'}])
    for j, year in enumerate(years)
)

# Titles, axis labels and the dropdown menu (placed above the plot area)
fig.update_layout(
    title='Monthly Accidents Over Years',
    xaxis_title='Month',
    yaxis_title='Number of Accidents',
    updatemenus=[dict(
        active=0,
        buttons=dropdown_buttons,
        x=0.1,
        y=1.15,
        xanchor='left',
        yanchor='top'
    )]
)
# Display the plot
fig.show()
In [29]:
import matplotlib.animation as animation
from IPython.display import HTML

# Sum deaths and injuries per calendar month (1-12); rows from all years that
# share a month number are pooled together.
casualty_columns = ['Number_of_Deaths', 'Number_of_Injuries']
monthly_totals = df.groupby('Month')[casualty_columns].sum().reset_index()

# Combined casualties per month: deaths plus injuries.
monthly_totals['Total'] = monthly_totals[casualty_columns].sum(axis=1)
In [30]:
# Use interactive mode for Jupyter Notebook
%matplotlib notebook
# Set up the figure and axis
fig, ax = plt.subplots(figsize=(8, 4))
# Function to initialize the plot
def init():
ax.clear()
ax.set_xlim(1, 12)
ax.set_ylim(0, monthly_totals['Total'].max() + 10)
ax.set_xlabel('Month')
ax.set_ylabel('Total Number of Deaths and Injuries')
ax.set_title('Total Number of Deaths and Injuries by Month')
return ax
# Function to animate the plot
def animate(i):
ax.clear()
ax.set_xlim(1, 12)
ax.set_ylim(0, monthly_totals['Total'].max() + 10)
ax.set_xlabel('Month')
ax.set_ylabel('Total Number of Deaths and Injuries')
ax.set_title('Total Number of Deaths and Injuries by Month')
ax.bar(monthly_totals['Month'].iloc[:i+1], monthly_totals['Total'].iloc[:i+1], color='#FFA07A')
return ax
# Create the animation
ani = animation.FuncAnimation(fig, animate, init_func=init, frames=len(monthly_totals), interval=500, repeat=False)
# Display the animation in the notebook
HTML(ani.to_jshtml())
Out[30]:
In [ ]: